import requests
import re

# Scrape poems poet by poet.
def _format_poem(title, content):
    """Return one poem record string built from a raw title and raw HTML body.

    The body is cleaned before formatting: HTML tags, single-character
    bracketed markers such as ``[1]`` and ASCII-parenthesised text are
    removed, then all newlines are dropped.  The title is used as-is.
    """
    # Raw string: '\[' and '\(' are invalid escapes in a normal literal.
    content = re.sub(r'<.*?>|\[.\]|\(.*?\)', '', content)
    content = content.replace('\n', '')
    # Every record uses the same layout; author and dynasty are fixed.
    return ("[{'title': '%s', 'authors': '孟浩然', 'dynasties': '〔唐代〕', "
            "'contents': '%s'}]" % (title, content))


def parse_page(url):
    """Download the page at *url* and extract its poems as record strings.

    Titles are taken from ``<b>...</b>`` elements and poem bodies from
    ``<div class="contson" id=...>`` elements; the two lists are paired by
    position, so the result is as long as the shorter of the two.  The
    titles and every formatted record are also printed to stdout.

    Args:
        url: Address of the page to download.

    Returns:
        A list of strings, one per poem, each of the form
        ``[{'title': ..., 'authors': ..., 'dynasties': ..., 'contents': ...}]``.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict in the query string
    # and send no custom User-Agent at all.  The timeout keeps a stalled
    # connection from blocking the scraper forever.
    response = requests.get(url, headers=headers, timeout=30)
    text = response.text
    titles = re.findall(r'<b>(.*?)</b>', text, re.DOTALL)  # poem titles
    contents = re.findall(r'<div class="contson" id=.*?">(.*?)</div>', text, re.DOTALL)  # poem bodies
    print(titles)
    poems = [_format_poem(title, content) for title, content in zip(titles, contents)]
    for poem in poems:
        print(poem)
        print('---' * 80)
    return poems

def writeintext(poems, w):
    """Write every entry of *poems* to *w*, one entry per line.

    Each entry is passed through ``''.join`` before writing, so an entry may
    be a plain string or an iterable of string fragments.

    Args:
        poems: Iterable of entries to write.
        w: Object with a ``write`` method accepting text.
    """
    # map() is lazy, so each entry is joined right before it is written.
    for line in map(''.join, poems):
        w.write(line)
        w.write("\n")


def main():
    """Scrape listing pages 1 to 10 and append the poems to ``孟浩然.txt``.

    The output file is opened in append mode, so repeated runs add to the
    existing contents instead of replacing them.
    """
    # The context manager closes the file even if a page fails to download
    # or parse part-way through the run.
    with open('孟浩然.txt', 'a', encoding='utf-8') as w:
        for i in range(1, 11):
            # `astr` carries the percent-encoded UTF-8 author name.
            url = "https://so.gushiwen.cn/shiwens/default.aspx?page=" + str(i) + "&tstr=&astr=%E5%AD%9F%E6%B5%A9%E7%84%B6&cstr=&xstr="

            poems = parse_page(url)
            writeintext(poems, w)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()